setwd('C:\\Users\\v\\Documents\\My Documents full backup\\ODU\\Modern topics seminar\\Day 03')

library(lme4)  # load library


######################################
#GLM

#http://www.theanalysisfactor.com/glm-r-overdispersion-count-regression/
#Counts of students diagnosed as infected, number of days from initial outbreak

cases=read.csv("cases example.csv")
attach(cases)

head(cases) 

#use defaul link function (natural log)
model1 <- glm(Students ~ Days, family=poisson, data=cases)
summary(model1)

#The negative coefficient for Days indicates that as days increase, the mean number of students 
#with the disease is smaller. This coefficient is highly significant (p < 2e-16).

#If the residual deviance were greater than the degrees of freedom, 
#there would be over-dispersion. This means that there is extra variance not accounted for 
#by the model or by the error structure.  This is a very important model assumption, 
#so would have to re-fit the model using quasi poisson errors.

#allows overdispersion parameter to be estimated
model2 <- glm(Students ~ Days, family=quasipoisson, data=cases)
summary(model2)
#dispersion parameter is less than 1, so no overdispersion

#use model 1
summary(model1) #OR
model1$coefficients

#We can use the residual deviance to perform a goodness of fit test for the overall model. 
#The residual deviance is the difference between the deviance of the current model and 
#the maximum deviance of the ideal model where the predicted values are identical to the observed. 
#Therefore, if the residual difference is small enough, the goodness of fit test will 
#not be significant, indicating that the model fits the data. We conclude that the model 
#fits reasonably well because the goodness-of-fit chi-squared test is not statistically significant. 
#If the test had been statistically significant, it would indicate that the data do not fit the model well. 
#In that situation, we may try to determine if there are omitted predictor variables, 
#if our linearity assumption holds and/or if there is an issue of over-dispersion. 
with(model1, cbind(res.deviance = deviance, df = df.residual,
               p = pchisq(deviance, df.residual, lower.tail=FALSE)))

#check model with diagnostic plots
plot(model1)

Y <- predict(model1) #in log scale
Y2 <- exp(Y) #exponentiate of the fitted values to back-transform log scale to original data.
write.csv(Y2, "Y2 exp.csv")
#or use type="response"
Z <- predict(model1, type="response")

plot(Days, Students, xlab = "DAYS", ylab = "STUDENTS", pch = 16)
lines(Y2, lwd = 3, col = "blue") 
lines(Z, lwd = 3, col = "red") 

#Let's calculate the impact on the number of cases arising from a one day increase along the time axis. First we take the exponential of the coefficients.
coeffs <- exp(coef(model1))
coeffs

#We calculate the 95% confidence interval (upper and lower confidence limits) as follows:
CI <- exp(confint.default(model1))
CI

#We can calculate the change in number of students presenting with the disease for each additional day, as follows:
1 - 0.9826884
#The reduction (rate ratio) is approximately 0.02 cases for each additional day.

detach(cases)
####################################
#######################################


####GLMER
#####simple repeated measures random effects model

#Frequently we are not as interested in comparing the
#particular subjects in the study as much as we are interested
#in modeling the variability in the population from which the
#subjects were chosen.

#the individual or block which is repeatedly measured is the random effect
#nuisance variable that must be included

library(npmlreg)
library(car)
data(irlsuicide)
head(irlsuicide)
attach(irlsuicide)

#Region : 13 different health regions of Ireland.
#ID : factor with levels 1 2 3 . 12 13 corresponding to Regions.
#pop : a numeric vector giving the population size of each suppopulation group.
#death :a numeric vector giving the total number of deaths of each suppopulation group.
#sex : a factor for gender with levels 0 (female) and 1 (male).
#age : a factor for age with levels 1 (0-29), 2 (30-39), 3 (40-59), 4(60+ years).
#smr : a numeric vector with standardized mortality ratios (SMRs)
#expected : a numeric vector with `expected' number of cases obtained from a reference population (Ahlbom, 1993).

#Response is "death", a count
#examine Death by Region (the random factor)
plot(death ~ Region, data=irlsuicide, pch=16,col="lightblue")

#examine the data
hist(death)
#skewed, all positivie, Poisson distribution works

#quick tables of the data
(xtabs(death~ Region, data = irlsuicide))

#How do sex and age affect deaths, have to accout for Region
# random intercept - The intercept for Region can vary. Sex and age are fixed.
m1_res <-glmer(death ~ (1|Region) + sex + age, family= poisson, data = irlsuicide)
summary(m1_res)

#The assumption of Poisson distribution is that the variance is equal to the constant mean. 
#So the residuals have the same constant homogeneous assumption. If the data fits to a 
#Poisson distribution, then dispersion should be 1. However, dispersion can differ from 1 
#when there is a random effect or there are clusters of observations.

plot(m1_res)

#overdispersion is if its much bigger than 1
library(RVAideMemoire)
overdisp.glmer(m1_res)

#The residual plot shows the nonhomogeneous variace property of the Irish Suicide data. 
#The overdispersion calculaton also returns greater than 1.

#Therefore, further analyses such as examining over-dispersion, using other distributions, 
#centering the variables, or transformation of variables can be considered 
#for better model fitting without overdispersion problem. 

#Note:
##One ***possible*** approach is to create an observation-level random effect
#Create a new dataset with one level per observation
X=1:104
newirlsuicide=cbind(X,irlsuicide)

#include that new variable as a random effect
m2_res <-glmer(death ~ (1|Region)+(1|X) + sex + age , family= poisson, data = newirlsuicide)
summary(m2_res)

#test overdispersion again
overdisp.glmer(m2_res)

#You'd have to investigate if this were really approapriate for your issue. 
#Other approaches are possible.

detach(irlsuicide)